This project is just for fun

Using a dataset of user movie reviews provided by https://grouplens.org/datasets/movielens/latest/, we will try (by analyzing the data) to predict the most profitable movie we could produce to make a lot of money.

The following 7 chunks of code are the boring part: cleaning and preprocessing the data. Basically, what we will do is find the best genre from the ratings and views, then take the best movies in this genre and see what tags they have.

first of all the libraries we will use

library(reshape2)
library(ggplot2)
library(hrbrthemes)
library(stringr)
library(dplyr)
library(wordcloud2) 
options(scipen = 999)

Reading the Data

The data is provided as 3 CSV files; you can learn more about them on the MovieLens page linked above, but here is a brief view of each table.

# Load the three MovieLens tables from the local Data/ folder and preview
# the first rows of the movies table.
movies <- read.csv(file = "Data/movies.csv")
ratings <- read.csv(file = "Data/ratings.csv")
tags <- read.csv(file = "Data/tags.csv")
head(movies, n = 6L)
##   movieId                              title
## 1       1                   Toy Story (1995)
## 2       2                     Jumanji (1995)
## 3       3            Grumpier Old Men (1995)
## 4       4           Waiting to Exhale (1995)
## 5       5 Father of the Bride Part II (1995)
## 6       6                        Heat (1995)
##                                        genres
## 1 Adventure|Animation|Children|Comedy|Fantasy
## 2                  Adventure|Children|Fantasy
## 3                              Comedy|Romance
## 4                        Comedy|Drama|Romance
## 5                                      Comedy
## 6                       Action|Crime|Thriller
# Preview the first six rows of the ratings table.
head(ratings, n = 6L)
##   userId movieId rating timestamp
## 1      1       1      4 964982703
## 2      1       3      4 964981247
## 3      1       6      4 964982224
## 4      1      47      5 964983815
## 5      1      50      5 964982931
## 6      1      70      3 964982400
# Preview the first six rows of the tags table.
head(tags, n = 6L)
##   userId movieId             tag  timestamp
## 1      2   60756           funny 1445714994
## 2      2   60756 Highly quotable 1445714996
## 3      2   60756    will ferrell 1445714992
## 4      2   89774    Boxing story 1445715207
## 5      2   89774             MMA 1445715200
## 6      2   89774       Tom Hardy 1445715205

Classing the data

Converting the data columns to the right classes

# Convert columns to the classes used by the rest of the analysis.

# Epoch seconds -> POSIXct date-times in UTC.
ratings[["timestamp"]] <- as.POSIXct(
  ratings[["timestamp"]],
  origin = '1970-1-1',
  tz = "UTC"
)
tags[["timestamp"]] <- as.POSIXct(
  tags[["timestamp"]],
  origin = '1970-1-1',
  tz = "UTC"
)

# Text-like columns are stored as plain character vectors.
tags[c("tag", "movieId")] <- lapply(tags[c("tag", "movieId")], as.character)
movies[c("title", "genres")] <- lapply(movies[c("title", "genres")], as.character)

# Snapshot of the movies table taken before the titles are cleaned; the later
# steps read the original titles and genres from this copy.
movies2 <- movies

Cleaning the movies data

Separating the movie title from the year

# Cleaning data: split "Title (YYYY)" into a clean title and a release year.
#
# The whole column is processed at once instead of growing a vector row by
# row. A regular expression is used rather than fixed character offsets, so
# titles that do not end in "(YYYY)" keep their full text and get an NA year
# instead of a garbled substring.
raw_title <- str_trim(as.character(movies$title))

# Columns of `title_parts`: full match, title text, four year digits.
# A row is all NA when the title has no trailing "(YYYY)".
title_parts <- str_match(raw_title, "^(.*?)\\s*\\((\\d{4})\\)$")
has_year <- !is.na(title_parts[, 1])

# The year stays a character column, as before.
movies$year <- title_parts[, 3]
movies$title <- ifelse(has_year, title_parts[, 2], raw_title)
head(movies)
##   movieId                       title
## 1       1                   Toy Story
## 2       2                     Jumanji
## 3       3            Grumpier Old Men
## 4       4           Waiting to Exhale
## 5       5 Father of the Bride Part II
## 6       6                        Heat
##                                        genres year
## 1 Adventure|Animation|Children|Comedy|Fantasy 1995
## 2                  Adventure|Children|Fantasy 1995
## 3                              Comedy|Romance 1995
## 4                        Comedy|Drama|Romance 1995
## 5                                      Comedy 1995
## 6                       Action|Crime|Thriller 1995

creating a new data frame to contain all the genres with the total rate , views for each one

# Placeholder one-row data frame that fixes the column layout of the
# per-genre totals (genre name, summed rating, number of ratings, year).
# The dummy row is removed again once the real rows have been appended.
genres <- data.frame(genre = "", rate = 0, views = 0, year = 0)

Looping through each rated movie’s genres: add 1 to the views of that genre and add the movie’s rating to the rate of that genre

# For every (genre, rating year) pair, sum the ratings ("rate") and count the
# ratings ("views"), then append one row per pair below the placeholder row
# already present in `genres`.
#
# The previous row-by-row loop checked "this year exists somewhere" and "this
# genre exists somewhere" independently and then added to every row of the
# genre, so the per-year rows were wrong and later per-genre sums counted the
# same rating several times. Grouping on genre and year together fixes that,
# and doing it in one pass avoids calling rbind() once per rating.

# Year of each rating (the timestamps were converted to POSIXct earlier).
rating_year <- format(
  as.Date(as.POSIXct(ratings$timestamp, tz = "UTC")),
  "%Y"
)

# Genre list of the rated movie; genres are "|"-separated in movies$genres.
rated_genres <- strsplit(
  movies$genres[match(ratings$movieId, movies$movieId)],
  "\\|"
)
genres_per_rating <- lengths(rated_genres)

# One row per (rating, genre) combination, in the original rating order.
rating_genre <- data.frame(
  genre = as.character(unlist(rated_genres)),
  rate = rep(ratings$rating, times = genres_per_rating),
  year = rep(rating_year, times = genres_per_rating),
  stringsAsFactors = FALSE
)
# Ratings whose movieId is missing from `movies` have no genre; skip them.
rating_genre <- rating_genre[!is.na(rating_genre$genre), ]

if (nrow(rating_genre) > 0) {
  # Genre names cannot contain "|" (it is the separator), so the key is unique.
  group_key <- paste(rating_genre$genre, rating_genre$year, sep = "|")
  totals <- rowsum(
    cbind(rate = rating_genre$rate, views = rep(1, nrow(rating_genre))),
    group_key,
    reorder = FALSE # keep groups in order of first appearance
  )
  first_row <- match(rownames(totals), group_key)

  genres <- rbind(
    genres,
    data.frame(
      genre = rating_genre$genre[first_row],
      rate = unname(totals[, "rate"]),
      views = unname(totals[, "views"]),
      year = rating_genre$year[first_row],
      stringsAsFactors = FALSE
    )
  )
}

The “genres” dataframe after some cleaning

# Erase the placeholder first row and drop the "(no genres listed)" entries.
# A negative index is used instead of 2:nrow(genres): when only the
# placeholder row exists, 2:1 counts backwards and would return an NA row plus
# the placeholder itself rather than an empty table.
genres <- genres[-1, ]
genres <- genres %>% filter(genre != "(no genres listed)")
# Show the head of genres
head(genres)
##       genre     rate views year
## 1 Adventure  84752.5 24156 2000
## 2 Animation  25366.0  6988 2000
## 3  Children  31426.5  9208 2000
## 4    Comedy 132167.5 39047 2000
## 5   Fantasy  41312.5 11834 2000
## 6   Romance  63552.0 18124 2000

———————————————————————————-

———————————————————————————-

Now let’s say that we want to determine our movie’s genre. Maybe we want to know which genre has the best rating, because that means people will love our movie, right? Let’s see all the genres’ ratings.

# Collapse to one row per genre by summing every other column.
# NOTE(review): `genres_rates` is not assigned anywhere in the visible code
# before this line; presumably it is derived from `genres` in a chunk that is
# not shown. Confirm.
genres_rates <- genres_rates %>%
  group_by(genre) %>%
  summarise_all(sum) %>%
  as.data.frame()

# Mean rating per genre = sum of ratings / number of reviews.
genres_rates$rate <- genres_rates$rate / genres_rates$views

# Bar chart of the mean rating, genres ordered by that mean rating.
ggplot(genres_rates, aes(x = reorder(genre, rate), y = rate, fill = genre)) +
  geom_col() +
  labs(
    x = "Genre",
    y = "Rate",
    title = "Comparing with the rate of each genre accorfing to the rate itself"
  ) +
  theme_ft_rc() +
  theme(axis.text.x = element_text(angle = 90))
genres-rates-rate

genres-rates-rate

But who wants people to love their movie?! We’re talking money $$$ here, so let’s reorder the genres by the number of views.

# Same mean-rating bars, but with the genres ordered by their number of views.
ggplot(genres_rates, aes(x = reorder(genre, views), y = rate, fill = genre)) +
  geom_col() +
  labs(
    x = "Genre",
    y = "Rate",
    title = "Comparing with the rate of each genre accorfing to the views"
  ) +
  theme_ft_rc() +
  theme(axis.text.x = element_text(angle = 90))
genres-rates-views

genres-rates-views

Okay, okay, let’s calm down for a moment. Because the differences in ratings are very small, let’s just compare all genres by the number of views.

# Bar chart of the number of views per genre, ordered by views.
# NOTE(review): `genres_views` is not assigned in the visible code; presumably
# it is built from `genres` in a chunk that is not shown. Confirm.
ggplot(genres_views, aes(x = reorder(genre, views), y = views, fill = genre)) +
  geom_col() +
  labs(
    x = "Genre",
    y = "Views",
    title = "Comparing with the views of each genre"
  ) +
  theme_ft_rc() +
  theme(axis.text.x = element_text(angle = 90))
genres-views

genres-views

Yes, we want money, but we also need people to like our movie so that maybe they will watch it again and again. I’m saying this because “Comedy” has more views, but “Drama” is very close to it, and “Drama” is also ahead of “Comedy” in rating.

So maybe we need to consider making our movie’s genre “Drama”

———————————————————————————-

———————————————————————————-

Okay Now our movie main genre is “Drama”, but what’s the main keywords it will include ?

Luckily, we have a tags table that includes the movie id and the tag, so all we need to do is get all the movies under the “Drama” genre and see what tags they have.

let’s collect the ids from the movies table

# Collect the ids of all "Drama" movies: a movie qualifies when "Drama" is one
# of the "|"-separated entries of its genres string.
# Done as a single vectorised membership test instead of appending to `ids`
# inside a 1:nrow() loop, which re-copied the vector on every hit and
# misbehaves on an empty table.
is_drama <- vapply(
  strsplit(movies2$genres, "\\|"),
  function(movie_genres) "Drama" %in% movie_genres,
  logical(1)
)
ids <- movies2$movieId[is_drama]

We want the best “Drama” movies by rating, so we will filter the ratings table to: - take the movie id and movie rating columns - keep movies whose id is in our collected ids, so we know they are “Drama” - group by movie, because there is more than one rating for each movie - take the mean of each movie’s ratings - keep only movies with a 5-star mean rating

# Mean rating of every Drama movie, keeping only the id and rating columns.
drama <- ratings[, names(ratings) %in% c("movieId", "rating")] %>%
  filter(movieId %in% ids) %>%
  group_by(movieId) %>%
  summarise_all(mean) %>%
  as.data.frame()

# Highest mean first, then keep only the movies whose mean rating is exactly 5.
# Base indexing is used so the original row names are carried along.
by_rating <- order(-drama$rating)
drama <- drama[by_rating, ]
is_perfect <- drama$rating == 5
drama <- drama[is_perfect, ]
head(drama)
##     movieId rating
## 27       53      5
## 61      148      5
## 222     495      5
## 223     496      5
## 426    1140      5
## 428    1151      5

Now let’s store the chosen movies’ titles to plot them

# Titles (uncleaned, from movies2) of the selected Drama movies.
temp <- data.frame(movie = movies2$title[movies2$movieId %in% drama$movieId])
# Random weights between 1 and 20, used only to vary the word sizes.
temp$freq <- round(runif(n = nrow(temp), min = 1, max = 20))

And here is a word cloud of some of the chosen movies

# Draw the word cloud of movie titles.
wordcloud2(data = temp)
movies-wordcloud

movies-wordcloud

Now let’s take the tags they have

# tags$movieId was converted to character earlier, so compare like with like.
drama$movieId <- as.character(drama$movieId)
# Tags that describe availability rather than the movie itself.
filt <- c("In Netflix queue", "free to download")
# Keep the tags of the selected Drama movies, minus the excluded ones,
# ordered by the time they were added.
tags <- tags %>% filter(movieId %in% drama$movieId & !tag %in% filt)
tags <- tags[order(tags$timestamp), ]
# Take the tag column directly: indexing rows with 1:nrow(tags) would yield an
# NA entry when no tag matched.
tags <- tags$tag
temp <- data.frame(word = tags)
head(temp)
##                word
## 1           England
## 2       imagination
## 3 social commentary
## 4        creativity
## 5          dystopia
## 6        claymation

and plot them

# Random weights between 1 and 10, used only to vary the word sizes,
# then draw the word cloud of tags.
temp$freq <- round(runif(n = nrow(temp), min = 1, max = 10))
wordcloud2(data = temp)
tags-wordcloud

tags-wordcloud

So we finally have our movie: it’ll be an atmospheric, no-dialogue, dystopian, harsh, disturbing, bleak and gritty movie involving a story of creativity and imagination that happens in or involves “England”.

Hmmm, not quite what we expected… but I guess the internet has a very weird taste.

We made it, We have our movie, we will be millionaires …. The only problem is we need money first to produce the movie :D